* Start from an empty session: drop any data and stored objects left in memory
clear all
* Disable the --more-- pause so long output does not halt the run
set more off

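* Set the working directory here: it must be the folder that holds the two CSV
* input files (Sample.dta is read from its parent folder)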

/* This do-file combines three data sets to generate the variables used in the
analysis of IVR lesson completion (figures and Table 5):
1. IVR lesson attendance sheets collected from HHs (svyIVR_attendance_raw.csv)
2. IVR quiz completion & repeat records from the Telco server (IVR_quiz.csv)
3. Sample list that contains IDs and treatment variables (Sample.dta)
The output, IVR_Lessons_Data.dta, can be merged with the other data to produce
the final data set used for all tables in the main text.
*/

***** Import IVR survey attendance sheet data with IVR quiz completion data ****
* Show the working directory in the log so the input location is traceable
pwd 

* Load the Telco quiz records first and park them in a temporary file.
* The tempfile macro is used as-is (quoted, no ".dta" suffix): appending a
* suffix creates a different file that Stata does not erase when the do-file
* ends, and an unquoted path breaks if the temp directory contains spaces.
import delimited using "IVR_quiz.csv", varnames(1) case(preserve) clear
sort CHILD_ID
tempfile quiz
save "`quiz'", replace
********************************************************************************

***** Merge the IVR survey attendance sheet and IVR quiz complete data *********
* The attendance sheet is the master data set; quiz records are matched on
* CHILD_ID, one row per child on each side
import delimited using "svyIVR_attendance_raw.csv", varnames(1) case(preserve) clear
sort CHILD_ID
merge 1:1 CHILD_ID using "`quiz'"
* Report how many children matched before discarding the merge indicator
tab _merge
drop _merge
********************************************************************************


***************** Generate key variables used in data analysis *****************
* A missing phone-match flag is treated as "no match"
recode matchphone (. = 0)

* Literacy and numeracy (30 lessons each).
* A lesson counts as completed when the attendance sheet marks it (==1) or the
* quiz record shows at least one completion; a lesson counts as repeated when
* the quiz record flags a repeat. Both indicators are left missing whenever the
* attendance-sheet entry for that lesson is missing.
forvalues k = 1/30 {
	gen complete_lit_`k' = cond(Lite_Less_`k' == ., ., ///
		Lite_Less_`k' == 1 | (complete_quiz_lit_`k' >= 1 & complete_quiz_lit_`k' != .))

	gen complete_num_`k' = cond(Num_Less_`k' == ., ., ///
		Num_Less_`k' == 1 | (complete_quiz_num_`k' >= 1 & complete_quiz_num_`k' != .))

	gen repeat_lit_`k' = cond(Lite_Less_`k' == ., ., repeat_quiz_lit_`k' == 1)

	gen repeat_num_`k' = cond(Num_Less_`k' == ., ., repeat_quiz_num_`k' == 1)
}

* Leadership (15 lessons): completion comes from the attendance sheet only
forvalues k = 1/15 {
	gen complete_lead_`k' = cond(Lead_Less_`k' == ., ., Lead_Less_`k' == 1)
}

* generate svyIVR_type variable: 1 = sheet with information, 2 = blank sheet
* (further codes are assigned after merging with the full sample)
gen svyIVR_type = 1
replace svyIVR_type = 2 if blank==1

* keep key variables only, in a fixed column order
local keyvars CHILD_ID treatment_arms svy_missing svyIVR_type evercomp_* matchphone ///
	complete_lit_* complete_num_* complete_lead_* repeat_lit_* repeat_num_*
keep `keyvars'
order `keyvars'
	
* save intermediary file to merge later
* (the tempfile macro is quoted and used without a ".dta" suffix so that Stata
* erases the file automatically and paths containing spaces still work)
sort CHILD_ID
tempfile quiz
save "`quiz'", replace
********************************************************************************

******* merge with full list to fix missing values and generate new vars *******
* The full sample of HHs lives one folder above the working directory. It is
* read through a relative path, so the working directory is never changed and
* the name of the current folder does not need to be hard-coded.
use "../Sample.dta", clear

* The sample list is the master data set, so children who never returned a
* sheet are kept, with the lesson variables missing
sort CHILD_ID
merge 1:1 CHILD_ID using "`quiz'", nogenerate

** recode variables
* Full variable names are used throughout (no abbreviations) so the code also
* runs under "set varabbrev off" and cannot silently pick up another variable.

* Children without attendance-sheet data are flagged as not having returned it
replace svy_missing = 1 if  svy_missing==.

* svyIVR_type: 3 = did not return (non-control arms), 4 = treatment_arms==3
replace svyIVR_type = 3 if svy_missing==1 & treatment_arms~=3

replace svyIVR_type = 4 if treatment_arms==3

replace matchphone = 0 if matchphone==.

** generate total completed lesson variables
* rowtotal() treats missing lesson indicators as 0, so totals are blanked out
* again where no usable sheet exists: any svyIVR_type other than 1, or a sheet
* of type 1 whose first lesson entry is missing while evercomp is 0 or 1.
foreach s in lit num lead {
	egen totlessons_`s' = rowtotal(complete_`s'_*)
	replace totlessons_`s' = . if svyIVR_type~=1
	replace totlessons_`s' = . if svyIVR_type==1 & treatment_arms~=3 ///
		& inlist(evercomp_`s', 0, 1) & complete_`s'_1==.
}

* Leadership totals are kept only for treatment_arms==2
replace totlessons_lead = . if treatment_arms~=2

* save intermediary file for generating sequence variables; it is reloaded
* below once the sequence indicators have been built.
* All tempfile macros in this section are quoted and used without a ".dta"
* suffix, so Stata erases the files automatically at the end of the run.
sort CHILD_ID
tempfile completion
save "`completion'", replace
********************************************************************************

************************* generate sequence variables **************************
* One row per child and lesson number
reshape long complete_lit_ complete_num_, i(CHILD_ID) j(number)
rename complete_lit_ lit_module
rename complete_num_ num_module

* For each subject, keep only the completed lessons and check whether their
* lesson numbers form one unbroken run: a lesson is "in sequence" when it is
* the child's first completed lesson or its number is exactly one above the
* previous completed lesson. sequence_<subject> is 1 when every completed
* lesson is in sequence, i.e. the count equals totlessons_<subject>.
foreach subj in lit num {
	preserve
	drop if (`subj'_module==0 | `subj'_module==.)

	sort CHILD_ID number
	by CHILD_ID: gen temp_sequence_dummy = (_n==1 | number == (number[_n-1] + 1))
	by CHILD_ID: egen tot_temp_sequence_dummy = total(temp_sequence_dummy)
	gen sequence_`subj' = (tot_temp_sequence_dummy==totlessons_`subj')

	* collapse to one row per child and store the indicator
	by CHILD_ID: keep if _n==1
	keep CHILD_ID sequence_`subj'
	sort CHILD_ID
	tempfile seq`subj'
	save "`seq`subj''", replace
	restore
}

* Make data one observation per child *****
* (sorted explicitly rather than relying on the order left by restore)
bysort CHILD_ID (number): keep if _n==1

* Children with no completed lesson in a subject are absent from that file and
* therefore keep a missing sequence indicator
merge 1:1 CHILD_ID using "`seqlit'", nogenerate
merge 1:1 CHILD_ID using "`seqnum'", nogenerate

* keep only sequence variables
keep CHILD_ID treatment_arms sequence_*

* Save sequence data set
sort CHILD_ID
tempfile sequence
save "`sequence'", replace
********************************************************************************


*********** Put completion and sequence variables together *********************
use "`completion'", clear
sort CHILD_ID
merge 1:1 CHILD_ID using "`sequence'"
tab _merge
drop _merge

*********** generate remaining total and repeat variables **********************
* Full variable names are used throughout (no abbreviations) so the code also
* runs under "set varabbrev off".

* Combined total is missing whenever either subject total is missing
gen totlessons_tot = totlessons_lit + totlessons_num

* Number of lessons ever repeated, blanked out under the same rules as the
* completion totals: any svyIVR_type other than 1, or a sheet of type 1 whose
* first lesson entry is missing while evercomp is 0 or 1.
foreach s in lit num {
	egen rep_totlessons_`s' = rowtotal(repeat_`s'_*)
	replace rep_totlessons_`s' = . if svyIVR_type~=1
	replace rep_totlessons_`s' = . if svyIVR_type==1 & treatment_arms~=3 ///
		& inlist(evercomp_`s', 0, 1) & complete_`s'_1==.
}

gen rep_totlessons_tot = rep_totlessons_lit + rep_totlessons_num

* Share of completed lessons that were repeated, in percent. The ratio is
* stored first and then scaled, which keeps the stored values identical to the
* earlier two-step computation. A zero or missing total yields missing.
foreach v in lit num tot {
	gen reppct_`v'_t = (rep_totlessons_`v'/totlessons_`v')
	gen reppct_`v' = reppct_`v'_t*100
	drop reppct_`v'_t
}

***** generate different groups of total lessons listened for graphing use *****
* Bins per subject: 0 = treatment_arms==3, 1 = 1-19 lessons, 2 = 20-29,
* 3 = all 30, 4 = evercomp flag set but no lesson total available.
* inrange() is false for missing totals, like the explicit >= & <= test.
foreach s in lit num {
	gen cbin_`s' = 0 if treatment_arms==3
	replace cbin_`s' = 1 if inrange(totlessons_`s', 1, 19)
	replace cbin_`s' = 2 if inrange(totlessons_`s', 20, 29)
	replace cbin_`s' = 3 if (totlessons_`s'==30)
	replace cbin_`s' = 4 if (evercomp_`s'==1 & treatment_arms~=3 & totlessons_`s'==.)
}

* Combined bins: 1 = 1-38 lessons, 2 = 39-59, 3 = all 60
gen cbin_tot = 0 if treatment_arms==3
replace cbin_tot = 1 if inrange(totlessons_tot, 1, 38)
replace cbin_tot = 2 if inrange(totlessons_tot, 39, 59)
replace cbin_tot = 3 if (totlessons_tot==60)
replace cbin_tot = 4 if (evercomp_lit==1 | evercomp_num==1) & treatment_arms~=3 & totlessons_tot==.


****************************** Label variables ******************************
* Subject words used to build the per-subject labels below
local word_lit "literacy"
local word_num "numeracy"
local word_lead "leadership"

** survey attendance sheet variables
label variable svy_missing "Did not return IVR survey"
label variable svyIVR_type "Type of IVR survey data"
foreach s in lit num lead {
	label variable evercomp_`s' "Ever accessed/completed `word_`s''"
}

label define lblsvy 1 "Info given" 2 "Blank sheet" 3 "Did not return" 4 "Control group", replace
label values svyIVR_type lblsvy

** combined attendance and quiz completion variables, lesson by lesson
foreach s in lit num {
	forvalues k = 1/30 {
		label variable complete_`s'_`k' "Completed `word_`s'' lesson #`k'"
		label variable repeat_`s'_`k' "Ever repeated `word_`s'' lesson #`k'"
	}
}
forvalues k = 1/15 {
	label variable complete_lead_`k' "Completed `word_lead' lesson #`k'"
}

** matchphone indicator
label variable matchphone "Phone matches with IVR data"

** totals, repetition and sequence variables
foreach s in lit num lead {
	label variable totlessons_`s' "Number of `word_`s'' lessons completed"
}
label variable totlessons_tot "Number of literacy & numeracy lessons completed"

foreach s in lit num {
	label variable rep_totlessons_`s' "Number of `word_`s'' lessons repeated"
	label variable reppct_`s' "Pct `word_`s'' completed lessons ever repeated"
	label variable sequence_`s' "Follow `word_`s'' sequence"
}
label variable rep_totlessons_tot "Number of literacy & numeracy lessons repeated"
label variable reppct_tot "Pct literacy & numeracy lessons ever repeated"

** bin variables for graphing use
label variable cbin_lit "Literacy lesson completed type"
label variable cbin_num "Numeracy lesson completed type"
label variable cbin_tot "Literacy & numeracy lesson completed type"

* literacy and numeracy share the same bin cut-offs
foreach s in lit num {
	label define lblcbin_`s' 0 "Never accessed" 1 "1-19" 2 "20-29" 3 "30" 4 "accessed some", replace
}
label define lblcbin_tot 0 "Never accessed" 1 "1-38" 2 "39-59" 3 "60" 4 "accessed some", replace

foreach s in lit num tot {
	label values cbin_`s' lblcbin_`s'
}

* write the final data set, one row per child, ordered by CHILD_ID
sort CHILD_ID
save "IVR_Lessons_Data.dta", replace